# Heuristic filter configuration: one top-level `input_field` setting plus an ordered `filters` list.
# NOTE(review): `input_field` presumably names the document field whose contents the filters
# examine (here `text`) — confirm against the code that loads this file.
input_field: text
filters:
  # The filters below define a chain of heuristic filters to be applied to each document in a corpus.
  # This particular cascade of filters is intended to filter English language data.
  # The filter listed at the top will be applied first, and the following filters will be applied in
  # the order they appear in this file. Each filter can be removed and re-ordered as desired.
  # Each entry pairs a `name` (a dotted path under nemo_curator.filters.heuristic_filter) with a
  # `params` mapping holding that filter's thresholds.
  # Character-, symbol-, and markup-ratio filters. Each entry sets a single `max_*` ratio limit.
  # NOTE(review): presumably a document whose measured ratio exceeds the limit is rejected —
  # confirm against each filter class.
  # Non-alphanumeric content relative to the text: limit 0.25.
  - name: nemo_curator.filters.heuristic_filter.NonAlphaNumericFilter
    params:
      max_non_alpha_numeric_to_text_ratio: 0.25
  # Symbols relative to words: limit 0.1.
  - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter
    params:
      max_symbol_to_word_ratio: 0.1
  # Numbers relative to the text: limit 0.15.
  - name: nemo_curator.filters.heuristic_filter.NumbersFilter
    params:
      max_number_to_text_ratio: 0.15
  # URLs relative to the text: limit 0.2.
  - name: nemo_curator.filters.heuristic_filter.UrlsFilter
    params:
      max_url_to_text_ratio: 0.2
  # White space: limit 0.25.
  - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter
    params:
      max_white_space_ratio: 0.25
  # Parentheses: limit 0.1.
  - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter
    params:
      max_parentheses_ratio: 0.1
  # Boilerplate-string filter: ratio limit 0.4, with `remove_if_at_top_or_bottom` enabled.
  # NOTE(review): presumably the flag rejects documents whose boilerplate sits at the very top or
  # bottom regardless of the ratio — confirm against the filter class.
  - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter
    params:
      # Booleans use the canonical lowercase form so every YAML loader resolves them identically.
      remove_if_at_top_or_bottom: true
      max_boilerplate_string_ratio: 0.4
  # Repeated-content filters: two measured per line/paragraph (0.7) and two measured by
  # character (0.8).
  # NOTE(review): despite the `max_` prefix, these values may be compared against the fraction of
  # content that remains after duplicates are removed — verify the threshold direction in the
  # filter classes before tuning.
  # Repeated lines: 0.7.
  - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter
    params:
      max_repeated_line_fraction: 0.7
  # Repeated paragraphs: 0.7.
  - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter
    params:
      max_repeated_paragraphs_ratio: 0.7
  # Repeated lines, by character: 0.8.
  - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter
    params:
      max_repeated_lines_char_ratio: 0.8
  # Repeated paragraphs, by character: 0.8.
  - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter
    params:
      max_repeated_paragraphs_char_ratio: 0.8
  # Word-count bounds: 50 to 100000 words.
  # NOTE(review): whether the bounds are inclusive is not visible here — confirm in the filter class.
  - name: nemo_curator.filters.heuristic_filter.WordCountFilter
    params:
      min_words: 50
      max_words: 100000
  # Sentences without an end mark: ratio limit 0.85.
  - name: nemo_curator.filters.heuristic_filter.PunctuationFilter
    params:
      max_num_sentences_without_endmark_ratio: 0.85
  # Words containing alphabetic characters: minimum 0.8.
  # NOTE(review): the value looks like a fraction of words rather than a count — confirm.
  - name: nemo_curator.filters.heuristic_filter.WordsWithoutAlphabetsFilter
    params:
      min_words_with_alphabets: 0.8
  # Common-English-words filter: requires at least 2 common words, with `stop_at_false` enabled.
  # NOTE(review): the exact effect of `stop_at_false` is not visible in this file — confirm against
  # the filter class before changing it.
  - name: nemo_curator.filters.heuristic_filter.CommonEnglishWordsFilter
    params:
      min_num_common_words: 2
      # Booleans use the canonical lowercase form so every YAML loader resolves them identically.
      stop_at_false: true
  # Mean word length bounds: 3 to 10.
  # NOTE(review): presumably measured in characters; inclusivity of the bounds is not visible here.
  - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter
    params:
      max_mean_word_length: 10
      min_mean_word_length: 3
  # Longest allowed single word: 1000.
  # NOTE(review): presumably a character count — confirm.
  - name: nemo_curator.filters.heuristic_filter.LongWordFilter
    params:
      max_word_length: 1000
  # Lines ending with an ellipsis: ratio limit 0.3.
  - name: nemo_curator.filters.heuristic_filter.EllipsisFilter
    params:
      max_num_lines_ending_with_ellipsis_ratio: 0.3
  # Top n-gram filters for n = 2, 3, and 4. The same filter class is listed once per n, and the
  # ratio limit tightens as n grows: 0.2, 0.18, 0.16.
  # NOTE(review): how the repeating-n-gram ratio is measured is not visible here — confirm against
  # the filter class.
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 2
      max_repeating_ngram_ratio: 0.2
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 3
      max_repeating_ngram_ratio: 0.18
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 4
      max_repeating_ngram_ratio: 0.16
  # Duplicate n-gram filters for n = 5, 6, 7, 8, 9, and 10. The same filter class is listed once
  # per n, and the ratio limit drops by 0.01 per step, from 0.15 at n = 5 to 0.10 at n = 10.
  # NOTE(review): how the duplicate-n-gram ratio is measured is not visible here — confirm against
  # the filter class.
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 5
      max_repeating_duplicate_ngram_ratio: 0.15
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 6
      max_repeating_duplicate_ngram_ratio: 0.14
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 7
      max_repeating_duplicate_ngram_ratio: 0.13
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 8
      max_repeating_duplicate_ngram_ratio: 0.12
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 9
      max_repeating_duplicate_ngram_ratio: 0.11
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 10
      max_repeating_duplicate_ngram_ratio: 0.10
  # Bullet-line filter: ratio limit 0.9. This is the last entry in the chain.
  # NOTE(review): presumably the ratio is bullet lines over all lines in the document — confirm.
  - name: nemo_curator.filters.heuristic_filter.BulletsFilter
    params:
      max_bullet_lines_ratio: 0.9
